Importing Datasets and Libraries
library(readr)
library(dplyr)
library(glmnet)
library(caret)
library(fastDummies)
library(tidyr)
library(ModelMetrics)
library(ggplot2)
library(lmtest)
library(MASS)
hosp_df = read_csv("new_hospital_with_wages.csv")
mun_df = read_csv("new_municipalities_with_wages.csv")
sch_df = read_csv("new_school_with_wages.csv")
Hospital Dataset
set.seed(1)
# Drop columns not used as predictors (Sector, Employer, Salary Paid,
# Taxable Benefits), then remove rows with missing values.
drop <- c("Sector", "Employer", "Salary Paid", "Taxable Benefits")
hosp_df <- hosp_df[, !(names(hosp_df) %in% drop)] %>%
drop_na()
# One-hot encode the remaining categorical columns (job title)
df <- dummy_cols(hosp_df, remove_selected_columns = TRUE)
# 70-30 train/test partition; floor() makes the integer truncation explicit
# instead of relying on sample() silently truncating a fractional size.
n <- floor(nrow(df) * 0.7)
random_index <- sample(nrow(df), n)
hosp_train <- df[random_index, ]
hosp_test <- df[-random_index, ]
# Column 5 is taken as the response (total compensation) — positional
# assumption; TODO confirm against the CSV schema. Remaining columns are
# the predictor matrix.
x_train_h <- data.matrix(hosp_train[-5])
y_train_h <- as.vector(unlist(hosp_train[5]))
x_test_h <- data.matrix(hosp_test[-5])
y_test_h <- as.vector(unlist(hosp_test[5]))
LINE Assumptions
# Fit an OLS baseline on the training data to inspect the LINE assumptions
# (Linearity, Independence, Normality, Equal variance).
lm_hosp <- lm(y_train_h ~ x_train_h)
# Residuals-vs-fitted plot: checks linearity and equal variance
res <- resid(lm_hosp)
plot(fitted(lm_hosp), res)
abline(0, 0)

# Breusch-Pagan test for heteroscedasticity
bptest(lm_hosp)
##
## studentized Breusch-Pagan test
##
## data: lm_hosp
## BP = 4439.5, df = 9, p-value < 2.2e-16
# QQ plot to check the normality assumption
qqnorm(res)
qqline(res)

# Box-Cox transformation to improve equal variance and normality
bc_data_hosp <- data.frame(y = y_train_h, x_train_h)
bc <- boxcox(y ~ . , data = bc_data_hosp)

lambda <- bc$x[which.max(bc$y)] # lambda maximizing the profile log-likelihood
# BUG FIX: the Box-Cox transform is (y^lambda - 1) / lambda; the original
# divided by y instead of lambda, which is not the Box-Cox transformation.
box_model_hosp <- lm(((y^lambda - 1) / lambda) ~ ., data = bc_data_hosp)
# Re-check the LINE assumptions on the transformed model
res_hosp <- resid(box_model_hosp)
plot(fitted(box_model_hosp), res_hosp)
abline(0, 0)

bptest(box_model_hosp)
##
## studentized Breusch-Pagan test
##
## data: box_model_hosp
## BP = 15644, df = 9, p-value < 2.2e-16
## NOTE(review): the BP output above was recorded with the pre-fix
## transform; re-run to refresh these values.
qqnorm(res_hosp)
qqline(res_hosp)

Creating a Lasso Regression with k-fold Cross-Validation
# Lasso regression for the hospital sector: the penalty is chosen by
# k-fold cross-validation on the training partition.
set.seed(1)
lasso.cv.out_h <- cv.glmnet(x = x_train_h, y = y_train_h, alpha = 1)
# plot(lasso.cv.out_h)  # uncomment to inspect the CV error curve
lambda_min_h <- lasso.cv.out_h$lambda.min # lambda with smallest CV error

# Training-set fit and error
train_hosp <- predict(lasso.cv.out_h, s = lambda_min_h, newx = x_train_h)
rmse_hosp_train <- rmse(y_train_h, train_hosp)
# rmse_hosp_train = 1.599515e-06

# Test-set predictions and error
pred_hosp <- predict(lasso.cv.out_h, s = lambda_min_h, newx = x_test_h)
rmse_hosp <- rmse(y_test_h, pred_hosp)
# rmse_hosp = 1.600682e-06

# Test-set R^2 = 1 - RSS/TSS
rss <- sum((pred_hosp - y_test_h)^2) # residual sum of squares
tss <- sum((y_test_h - mean(y_test_h))^2) # total sum of squares
rsq_hosp <- 1 - rss / tss
# rsq = 0.3077149
# rsq = 0.3077149
Visualizing the Predictions
# Scatter of predicted vs true total compensation on the test set.
# FIX: the original relied on the glmnet prediction matrix's "s1" column
# name leaking through data.frame(); coerce to a plain vector so the
# column is actually named Pred.
comp_data <- data.frame(True = y_test_h, Pred = as.vector(pred_hosp))
ggplot(comp_data, aes(x = Pred, y = True)) +
geom_point(color = "blue")

Municipalities Dataset
set.seed(1)
# Drop columns not used as predictors (Sector, Employer, Salary Paid,
# Taxable Benefits), then remove rows with missing values.
drop <- c("Sector", "Employer", "Salary Paid", "Taxable Benefits")
mun_df <- mun_df[, !(names(mun_df) %in% drop)] %>%
drop_na()
# One-hot encode the remaining categorical columns (job title)
df <- dummy_cols(mun_df, remove_selected_columns = TRUE)
# 70-30 train/test partition; floor() makes the integer truncation explicit
# instead of relying on sample() silently truncating a fractional size.
n <- floor(nrow(df) * 0.7)
random_index <- sample(nrow(df), n)
mun_train <- df[random_index, ]
mun_test <- df[-random_index, ]
# Column 5 is taken as the response (total compensation) — positional
# assumption; TODO confirm against the CSV schema. Remaining columns are
# the predictor matrix.
x_train_m <- data.matrix(mun_train[-5])
y_train_m <- as.vector(unlist(mun_train[5]))
x_test_m <- data.matrix(mun_test[-5])
y_test_m <- as.vector(unlist(mun_test[5]))
LINE Assumptions
# Fit an OLS baseline on the training data to inspect the LINE assumptions
# (Linearity, Independence, Normality, Equal variance).
lm_mun <- lm(y_train_m ~ x_train_m)
# Residuals-vs-fitted plot: checks linearity and equal variance
res <- resid(lm_mun)
plot(fitted(lm_mun), res)
abline(0, 0)

# Breusch-Pagan test for heteroscedasticity
bptest(lm_mun)
##
## studentized Breusch-Pagan test
##
## data: lm_mun
## BP = 15.805, df = 9, p-value = 0.07106
# QQ plot to check the normality assumption
qqnorm(res)
qqline(res)

# Box-Cox transformation to improve equal variance and normality
bc_data_mun <- data.frame(y = y_train_m, x_train_m)
bc <- boxcox(y ~ . , data = bc_data_mun)

lambda <- bc$x[which.max(bc$y)] # lambda maximizing the profile log-likelihood
# BUG FIX: the Box-Cox transform is (y^lambda - 1) / lambda; the original
# divided by y instead of lambda, which is not the Box-Cox transformation.
box_model_mun <- lm(((y^lambda - 1) / lambda) ~ ., data = bc_data_mun)
# Re-check the LINE assumptions on the transformed model
res_mun <- resid(box_model_mun)
plot(fitted(box_model_mun), res_mun)
abline(0, 0)

bptest(box_model_mun)
##
## studentized Breusch-Pagan test
##
## data: box_model_mun
## BP = 7310.9, df = 9, p-value < 2.2e-16
## NOTE(review): the BP output above was recorded with the pre-fix
## transform; re-run to refresh these values.
qqnorm(res_mun)
qqline(res_mun)

Creating a Lasso Regression with k-fold Cross-Validation
# Lasso regression for the municipalities sector: the penalty is chosen by
# k-fold cross-validation on the training partition.
set.seed(1)
lasso.cv.out_m <- cv.glmnet(x = x_train_m, y = y_train_m, alpha = 1)
# plot(lasso.cv.out_m)  # uncomment to inspect the CV error curve
lambda_min_m <- lasso.cv.out_m$lambda.min # lambda with smallest CV error

# Training-set fit and error
train_mun <- predict(lasso.cv.out_m, s = lambda_min_m, newx = x_train_m)
rmse_mun_train <- rmse(y_train_m, train_mun)
# rmse_mun_train = 1.018185e-06

# Test-set predictions and error
pred_mun <- predict(lasso.cv.out_m, s = lambda_min_m, newx = x_test_m)
rmse_mun <- rmse(y_test_m, pred_mun)
# rmse_mun = 1.018734e-06

# Test-set R^2 = 1 - RSS/TSS
rss <- sum((pred_mun - y_test_m)^2) # residual sum of squares
tss <- sum((y_test_m - mean(y_test_m))^2) # total sum of squares
rsq_mun <- 1 - rss / tss
# rsq = 0.1665321
# rsq = 0.1665321
Visualizing the Predictions
# Scatter of predicted vs true total compensation on the test set.
# FIX: the original relied on the glmnet prediction matrix's "s1" column
# name leaking through data.frame(); coerce to a plain vector so the
# column is actually named Pred.
comp_data <- data.frame(True = y_test_m, Pred = as.vector(pred_mun))
ggplot(comp_data, aes(x = Pred, y = True)) +
geom_point(color = "blue")

School Dataset
set.seed(1)
# Drop columns not used as predictors (Sector, Employer, Salary Paid,
# Taxable Benefits), then remove rows with missing values.
drop <- c("Sector", "Employer", "Salary Paid", "Taxable Benefits")
sch_df <- sch_df[, !(names(sch_df) %in% drop)] %>%
drop_na()
# One-hot encode the remaining categorical columns (job title)
df <- dummy_cols(sch_df, remove_selected_columns = TRUE)
# 70-30 train/test partition; floor() makes the integer truncation explicit
# instead of relying on sample() silently truncating a fractional size.
n <- floor(nrow(df) * 0.7)
random_index <- sample(nrow(df), n)
sch_train <- df[random_index, ]
sch_test <- df[-random_index, ]
# Column 5 is taken as the response (total compensation) — positional
# assumption; TODO confirm against the CSV schema. Remaining columns are
# the predictor matrix.
x_train_s <- data.matrix(sch_train[-5])
y_train_s <- as.vector(unlist(sch_train[5]))
x_test_s <- data.matrix(sch_test[-5])
y_test_s <- as.vector(unlist(sch_test[5]))
LINE Assumptions
# Fit an OLS baseline on the training data to inspect the LINE assumptions
# (Linearity, Independence, Normality, Equal variance).
lm_sch <- lm(y_train_s ~ x_train_s)
# Residuals-vs-fitted plot: checks linearity and equal variance
res <- resid(lm_sch)
plot(fitted(lm_sch), res)
abline(0, 0)

# Breusch-Pagan test for heteroscedasticity
bptest(lm_sch)
##
## studentized Breusch-Pagan test
##
## data: lm_sch
## BP = 862.56, df = 9, p-value < 2.2e-16
# QQ plot to check the normality assumption
qqnorm(res)
qqline(res)

# Box-Cox transformation to improve equal variance and normality
bc_data_sch <- data.frame(y = y_train_s, x_train_s)
bc <- boxcox(y ~ . , data = bc_data_sch)

lambda <- bc$x[which.max(bc$y)] # lambda maximizing the profile log-likelihood
# BUG FIX: the Box-Cox transform is (y^lambda - 1) / lambda; the original
# divided by y instead of lambda, which is not the Box-Cox transformation.
box_model_sch <- lm(((y^lambda - 1) / lambda) ~ ., data = bc_data_sch)
# Re-check the LINE assumptions on the transformed model
res_sch <- resid(box_model_sch)
plot(fitted(box_model_sch), res_sch)
abline(0, 0)

bptest(box_model_sch)
##
## studentized Breusch-Pagan test
##
## data: box_model_sch
## BP = 2566.3, df = 9, p-value < 2.2e-16
## NOTE(review): the BP output above was recorded with the pre-fix
## transform; re-run to refresh these values.
qqnorm(res_sch)
qqline(res_sch)

Creating a Lasso Regression with k-fold Cross-Validation
# Lasso regression for the school sector: the penalty is chosen by
# k-fold cross-validation on the training partition.
set.seed(1)
lasso.cv.out_s <- cv.glmnet(x = x_train_s, y = y_train_s, alpha = 1)
# plot(lasso.cv.out_s)  # uncomment to inspect the CV error curve
lambda_min_s <- lasso.cv.out_s$lambda.min # lambda with smallest CV error

# Training-set fit and error
train_sch <- predict(lasso.cv.out_s, s = lambda_min_s, newx = x_train_s)
rmse_sch_train <- rmse(y_train_s, train_sch)
# rmse_sch_train = 8.620759e-07

# Test-set predictions and error
pred_sch <- predict(lasso.cv.out_s, s = lambda_min_s, newx = x_test_s)
rmse_sch <- rmse(y_test_s, pred_sch)
# rmse_sch = 8.624815e-07

# Test-set R^2 = 1 - RSS/TSS
rss <- sum((pred_sch - y_test_s)^2) # residual sum of squares
tss <- sum((y_test_s - mean(y_test_s))^2) # total sum of squares
rsq_sch <- 1 - rss / tss
# rsq = 0.2186296
# rsq = 0.2186296
Visualizing the Predictions
# Scatter of predicted vs true total compensation on the test set.
# FIX: the original relied on the glmnet prediction matrix's "s1" column
# name leaking through data.frame(); coerce to a plain vector so the
# column is actually named Pred.
comp_data <- data.frame(True = y_test_s, Pred = as.vector(pred_sch))
ggplot(comp_data, aes(x = Pred, y = True)) +
geom_point(color = "blue")

Comparing the Three Lasso Regression Models
# Collect the train/test RMSE and test-set R^2 of the three sector models
# into a single comparison table.
regr_comp <- data.frame(
  sector = c("hospitals", "municipalities", "schools"),
  train_rmse = c(rmse_hosp_train, rmse_mun_train, rmse_sch_train),
  test_rmse = c(rmse_hosp, rmse_mun, rmse_sch),
  R2 = c(rsq_hosp, rsq_mun, rsq_sch)
)
# Display the comparison table
print(regr_comp)
## sector train_rmse test_rmse R2
## 1 hospitals 1.599515e-06 1.600682e-06 0.3077149
## 2 municipalities 1.018185e-06 1.018734e-06 0.1665321
## 3 schools 8.620759e-07 8.624815e-07 0.2186296